# Libraries
import pandas as pd
import numpy as np
import random
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from IPython.display import SVG
from graphviz import Source
from IPython.display import display
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
from sklearn.tree import export_graphviz
import graphviz
from sklearn.metrics import mean_squared_error
import warnings
from math import sqrt
# --- German credit data: load, explore, and split manually ------------------
input_file = r'C:\Users\mitikirim\Documents\Py\credit.csv'
credit = pd.read_csv(input_file)
credit.shape
#Explore data
credit.head(4)
# Target variable
target = credit['Creditability']
target.value_counts()
# Split: Method 1 -- shuffle row positions, then slice by position.
# Generalized: sizes are derived from the data instead of hard-coding
# 1000/700/300, so this works unchanged if the dataset grows or shrinks.
random.seed(12345)
n_rows = len(credit)
n_train = int(n_rows * 0.7)  # 70/30 train/test split
# A random permutation of all row positions (sampling n from n without replacement).
indx = random.sample(range(0, n_rows), n_rows)
credit_rand = credit.iloc[indx]
target_rand = target.iloc[indx]
credit_rand.head(5)
credit_rand.describe()
credit_train = credit_rand.iloc[0:n_train]
credit_test = credit_rand.iloc[n_train:n_rows]
target_train = target_rand.iloc[0:n_train]
target_test = target_rand.iloc[n_train:n_rows]
# Class proportions per split -- should be similar if the shuffle is fair.
target_train.value_counts() / n_train
target_test.value_counts() / (n_rows - n_train)
# Split: Method 2 -- let scikit-learn handle shuffling and slicing.
y = target
X = credit.drop(columns=['Creditability'])
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=52)

# Design decision tree: fit on the training partition.
model = DecisionTreeClassifier().fit(X_train, y_train)

# Render the fitted tree inline as an SVG.
dot_source = tree.export_graphviz(
    model,
    out_file=None,
    feature_names=X.columns,
    class_names=['default', 'no default'],
    filled=True,
)
graph = Source(dot_source)
display(SVG(graph.pipe(format='svg')))

# Evaluate on the held-out set.
y_predict = model.predict(X_test)
print(confusion_matrix(y_test, y_predict))
print(accuracy_score(y_test, y_predict) * 100)
Ans: We do not see an accuracy of 100% here. The accuracy we see is about 68%, meaning our model correctly predicts the outcome variable 68% of the time. An accuracy of 100% does not always mean we have a perfect model: the same model could produce lower accuracy on a different dataset, and an accuracy of 100% would suggest the classification problem is an easy one, which is seldom true in the real world. By changing the test-set and training-set proportions, we may get different results. Also, the model may not perform as accurately on unseen data; this can be verified with out-of-sample AUC. In addition, the sample may not always be randomly distributed.
# Random forest on the same train/test split as the decision tree above.
clf = RandomForestClassifier()
clf.fit(X_train, y_train)
y_predict = clf.predict(X_test)
print(confusion_matrix(y_test, y_predict))
warnings.filterwarnings('ignore')  # silence sklearn warning noise in the notebook
print(accuracy_score(y_test, y_predict) * 100)
# Bug fix: the importance table previously had a bare integer index, so the
# scores could not be matched back to feature names. Label each row with the
# feature it belongs to (column order of X_train matches feature_importances_).
feature_importances = pd.DataFrame(
    clf.feature_importances_,
    index=X_train.columns,
    columns=['importance'],
).sort_values('importance', ascending=False)
feature_importances
The three most important features of the model are Credit Amount, Duration of Credit (month), and Age (years). This is plausible: the longer the duration of the credit, the more data is available, and the larger the credit amount, the greater the usage.
# Re-split with a different seed to check how sensitive the forest's accuracy
# is to the particular train/test partition.
X_train.columns.values
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=23458)

warnings.filterwarnings('ignore')  # suppress sklearn warnings in the notebook
clf = RandomForestClassifier()
clf.fit(X_train, y_train)
y_predict = clf.predict(X_test)
print(confusion_matrix(y_test, y_predict))
print(accuracy_score(y_test, y_predict) * 100)
# --- White wines dataset: load and inspect the target distribution ----------
input_file = r"C:\Users\mitikirim\Documents\Py\whitewines.csv"
wine = pd.read_csv(input_file)
wine.head(5)
# Histogram of quality scores; they cluster around the middle values.
n, bins, patches = plt.hist(x=wine['quality'], bins='auto', color='b')
plt.xlabel('Quality Value')  # bug fix: label was misspelled 'Quallity Value'
plt.ylabel('Frequency')
plt.show()
# Split the data: quality is the target, all other columns are features.
target = wine['quality']
target.value_counts()
y = target
X = wine.drop(columns=['quality'])
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=52)

# Fit a classification tree on the (ordinal) quality scores.
model = tree.DecisionTreeClassifier().fit(X_train, y_train)

# Persist the tree to a .dot file on disk...
export_graphviz(model,
                out_file=r'C:\Users\mitikirim\Documents\Py\tree.dot',
                feature_names=X.columns)
# ...and render it inline as well.
dot_data = tree.export_graphviz(model,
                                out_file=None,
                                feature_names=X.columns,
                                filled=True,
                                rounded=True,
                                special_characters=True)
graph = graphviz.Source(dot_data)
graph

# Evaluate: accuracy, plus two regression-style diagnostics, since the
# quality labels are ordered numbers.
y_predict = model.predict(X_test)
print(confusion_matrix(y_test, y_predict))
print(accuracy_score(y_test, y_predict) * 100)
np.corrcoef(y_test, y_predict)
sqrt(mean_squared_error(y_test, y_predict))
Root Mean Squared Error (RMSE) is a measure used to evaluate the error rate of a model. Often, the output does not perfectly fit the input; this deviation of the predictions from the observed values is called error, and one way of measuring it is RMSE, provided the units of both input and output are the same. It is also inversely related to the model's correlation coefficient: if the correlation coefficient is 1, then the RMSE is 0. Here we have a correlation coefficient of about 56%, and the RMSE is about 0.81. This suggests the current model is a reasonable one for the given dataset.
# --- Online news popularity: binarize the target and fit a tree -------------
input_news = r"C:\Users\mitikirim\Documents\Py\OnlineNewsPopularity_for_python.csv"
news = pd.read_csv(input_news)
news.head(6)
# Pre-processing: binarize shares at the 1400 threshold
# (1 = popular, 0 = unpopular). Both masks are computed from the original
# column BEFORE either assignment mutates it, so the thresholding is correct.
popular = news.shares >= 1400
unpopular = news.shares < 1400
news.loc[popular, 'shares'] = 1
news.loc[unpopular, 'shares'] = 0
target = news['shares']
target.value_counts()
y = target
X = news.drop(['url', 'shares'], axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30,
                                                    random_state=9)
y.value_counts()
model = tree.DecisionTreeClassifier()
model = model.fit(X_train, y_train)
# Bug fix: class_names were copy-pasted from the credit example
# ('default'/'no default'). export_graphviz maps class_names to classes in
# ascending label order, so index 0 -> shares==0 (unpopular) and
# index 1 -> shares==1 (popular).
graph = Source(tree.export_graphviz(model, out_file=None,
                                    feature_names=X.columns,
                                    class_names=['unpopular', 'popular'],
                                    filled=True))
display(SVG(graph.pipe(format='svg')))
y_predict = model.predict(X_test)
print(confusion_matrix(y_test, y_predict))
print(accuracy_score(y_test, y_predict) * 100)
# Random forest on the news split -- compare against the single tree above.
warnings.filterwarnings('ignore')  # suppress sklearn warnings in the notebook
clf = RandomForestClassifier()
clf.fit(X_train, y_train)
y_predict = clf.predict(X_test)
print(confusion_matrix(y_test, y_predict))
print(accuracy_score(y_test, y_predict) * 100)
Random forest yields higher accuracy than the decision tree (62.577 vs 58.33). Although the false negatives are similar in both confusion matrices, the random forest model has significantly fewer false positives, thereby increasing the classification accuracy. Random forest is one of the best models for classification: it achieves higher accuracy by reducing overfitting through averaging multiple decision trees.